# ------------ Install / Load ------------
if(!require(Boruta)) install.packages("Boruta", dependencies = TRUE)
if(!require(randomForest)) install.packages("randomForest", dependencies = TRUE)
library(Boruta); library(randomForest)

# ------------ Read and clean the data ------------
data_path  <- "E:/DaliOcttoMaydynamic.csv"   # Input CSV file
target_var <- "SM"                           # Dependent variable column name

# Read the file, mapping the common textual spellings of "missing" to NA.
df <- read.csv(data_path,
               na.strings = c("", "NA", "NaN", "N/A", "NULL", "null", " "),
               stringsAsFactors = FALSE)
str(df)

# Fail early with a clear message when the target column is absent; without
# this check the conversion below stops with a much less helpful error.
if (!target_var %in% names(df)) {
  stop("Target column '", target_var, "' not found in ", data_path,
       call. = FALSE)
}

# Force the target to numeric (values may be mixed with text in the CSV).
# Going through as.character() first means that a factor column would be
# converted by its labels rather than by its internal integer codes.
df[[target_var]] <- as.numeric(as.character(df[[target_var]]))

# Quick check of unusable target values.
# is.na() is also TRUE for NaN, so the first line counts NA and NaN together.
# The second line counts only Inf/-Inf/NaN; !is.finite() is not used for it
# because that is TRUE for plain NA as well and would count NA twice.
cat("Number of NA values in SM: ", sum(is.na(df[[target_var]])), "\n")
cat("Number of non-finite values (Inf/NaN) in SM: ",
    sum(is.infinite(df[[target_var]]) | is.nan(df[[target_var]])), "\n")

# Keep only the rows with a finite target value (drops NA, NaN and +/-Inf).
df <- df[is.finite(df[[target_var]]), , drop = FALSE]
if (nrow(df) == 0L) {
  stop("No rows with a finite '", target_var, "' value remain.",
       call. = FALSE)
}

# Remove columns that carry no data at all (every value is NA).
# Note: constant (zero-variance) columns are NOT removed here.
all_na_cols <- vapply(df, function(x) all(is.na(x)), logical(1))
if (any(all_na_cols)) df <- df[, !all_na_cols, drop = FALSE]

# Date columns are not used as predictors; every remaining column except the
# target becomes a candidate feature.
drop_cols <- intersect(c("Date", "date", "DATE"), names(df))
feature_cols <- setdiff(names(df), c(target_var, drop_cols))
if (length(feature_cols) == 0L) {
  stop("No predictor columns remain after cleaning (is the data empty?).",
       call. = FALSE)
}

# Predictors must not contain NA: keep only rows that are complete across the
# target and all candidate features.
keep_rows <- complete.cases(df[, c(target_var, feature_cols), drop = FALSE])
cat("Number of rows removed due to NA: ", sum(!keep_rows), "\n")
df_model <- df[keep_rows, c(target_var, feature_cols), drop = FALSE]
if (nrow(df_model) == 0L) {
  stop("No complete rows remain after removing rows with NA.",
       call. = FALSE)
}

# Double-check that no missing values remain anywhere in the model frame.
stopifnot(!anyNA(df_model))

# ------------ Run Boruta ------------
# Seed the random number generator before the run, for reproducibility.
set.seed(123)

# Model formula: the target against every other column of df_model.
form <- as.formula(sprintf("%s ~ .", target_var))

# doTrace = 2 asks Boruta for verbose progress output; ntree is an extra
# argument that Boruta hands on to its importance source (number of trees).
# `formula` is kept as the first argument so the formula method is dispatched.
boruta_result <- Boruta(
  formula = form,
  data    = df_model,
  doTrace = 2,
  ntree   = 500
)

# Summary and default importance plot of the raw result.
print(boruta_result)
plot(boruta_result, las = 2, cex.axis = 0.7)

# Resolve attributes that are still Tentative.
boruta_final <- TentativeRoughFix(boruta_result)
print(boruta_final)

# Names of the attributes confirmed as important (Tentative ones excluded).
confirmed <- getSelectedAttributes(boruta_final, withTentative = FALSE)
cat("Confirmed important features:\n")
print(confirmed)

# Export the per-attribute statistics table, keeping attribute names as
# row names in the CSV.
importance_table <- attStats(boruta_final)
write.csv(importance_table,
          file = "E:/JJG2022_Boruta_Results.csv",
          row.names = TRUE)

# ------------ Optional: Beautify boxplot ------------
# Custom ggplot2 version of the Boruta importance boxplot.
# It is built from boruta_result (before TentativeRoughFix) so that Tentative
# attributes keep their own decision label and colour.
library(reshape2)
library(ggplot2)

## 1) Extract the original Boruta importance history
##    (one row per importance run, one column per attribute)
imp_history <- boruta_result$ImpHistory

## 2) Convert to long format: one row per (Iteration, Attribute) pair
imp_long <- melt(as.matrix(imp_history),
                 varnames = c("Iteration", "Attribute"),
                 value.name = "Importance")

## 3) Keep finite importances only and drop the shadow attributes.
##    Boruta stores -Inf for an attribute in every run after it has been
##    rejected; is.finite() removes those as well as NA, so they neither
##    distort the medians used for ordering nor trigger ggplot warnings.
imp_long <- subset(imp_long, is.finite(Importance))
imp_long <- subset(imp_long, !grepl("^shadow", Attribute))

## 4) Order the attributes on the x-axis by their median importance
ord <- tapply(imp_long$Importance, imp_long$Attribute, median, na.rm = TRUE)
imp_long$Attribute <- factor(imp_long$Attribute, levels = names(sort(ord)))

## 5) Attach the decision of each attribute, taken from the original
##    boruta_result (do not use the TentativeRoughFix() result here)
stats <- attStats(boruta_result)
stats$Attribute <- rownames(stats)

imp_long2 <- merge(imp_long, stats[, c("Attribute", "decision")],
                   by = "Attribute", all.x = TRUE)

## 6) Draw the boxplot (Tentative kept) with enlarged font sizes
boruta_boxplot <- ggplot(imp_long2,
                         aes(x = Attribute, y = Importance, fill = decision)) +
  geom_boxplot(outlier.size = 1.2, width = 0.6, lwd = 0.5) +
  scale_fill_manual(values = c(Confirmed = "#198E5C",
                               Tentative = "#3498DB",
                               Rejected = "#D43F3B")) +
  # Replace x-axis labels here; the names on the left are attribute
  # (column) names as they appear in the data.
  scale_x_discrete(labels = c(
    "Prec." = "Prec.",
    "WS"    = "WS",
    "T2m"   = expression(T[2*m]),   # m as subscript
    "AE"    = "AE",
    "rhu"   = "rhu"
  )) +
  labs(title = "Boruta Variable Importance (Soil Moisture)",
       x = NULL,
       y = "Importance (Z)",
       fill = "Decision") +
  theme_bw(base_size = 16) +
  theme(
    # Axis tick labels (not bold); x labels rotated for readability
    axis.text.x = element_text(angle = 45, hjust = 1, size = 14,
                               colour = "black"),
    axis.text.y = element_text(size = 14, colour = "black"),

    # Axis titles (bold)
    axis.title.x = element_text(size = 16, face = "bold",
                                margin = margin(t = 10), colour = "black"),
    axis.title.y = element_text(size = 16, face = "bold",
                                margin = margin(r = 10), colour = "black"),

    # Legend and plot title
    legend.title = element_text(size = 16, face = "bold"),
    legend.text  = element_text(size = 14, colour = "black"),
    plot.title   = element_text(size = 18, face = "bold", hjust = 0.5,
                                margin = margin(b = 15), colour = "black"),
    legend.position = "top"
  )

# Print explicitly so the figure is also drawn when the script is run via
# source(), where top-level values are not auto-printed.
print(boruta_boxplot)
